%matplotlib inline
#python includes
import sys
#standard probability includes:
import numpy as np #matrices and data structures
import scipy.stats as ss #standard statistical operations
import pandas as pd #keeps data organized, works well with data
import matplotlib
import matplotlib.pyplot as plt #plot visualization
#bootstrapping on confidence interval on the difference in means:
iters = 1000 #iterations
N1 = 500 #observations
N2 = 100 #observations
#pretend this data was real:
X1 = pd.Series(np.random.normal(100, 1., N1))
X2 = pd.Series(np.random.normal(99, 0.5, N2))
X1.hist(bins = N1/10, alpha=.7)
X2.hist(bins = N2/10, alpha=.5)
#plt.show()
#compute original difference
origDiff = X1.mean() - X2.mean()
print "original diff:", origDiff
#sys.exit(1)
reDiffs = list()
for i in range(iters):
#draw a random resampling of #drawing from the hat:
reX1indices = [int(d) for d in np.random.uniform(0, N1-1, N1)]
reX2indices = [int(d) for d in np.random.uniform(0, N2-1, N2)]
#print reX1indices
#sys.exit(1)
reX1 = X1[reX1indices] #resampled X1
reX2 = X2[reX2indices] #resampled X2
#reX1.hist(bins = N1/10, alpha=.3)
reDiff = reX1.mean() - reX2.mean()
#print "resampled diff", reDiff
reDiffs.append(reDiff)
plt.show()
sortedReDiffs = pd.Series(sorted(reDiffs))
#print sortedReDiffs.head()
sortedReDiffs.hist(bins = iters/6)
print "histogram of bootstrapped diffs"
sRDdesc = sortedReDiffs.describe(percentiles=[.025, .975])
plt.show()
print "The difference in means is %.4f with 95%% CI: [%.4f, %.4f]" % (origDiff,sRDdesc['2.5%'], sRDdesc['97.5%'])
(m is number of features)
import statsmodels.api as sm#load the iris data:
iris = pd.read_csv('iris.csv')
print iris.head()
print iris.describe()
#example: predicting Sepal Length from the other iris characteristics:
iris_train = iris[:130] #training data
iris_test = iris[130:150] #testing data
model = sm.OLS(iris_train['SepalLength'], iris_train[['SepalWidth', 'PetalLength', 'PetalWidth']]).fit()
y_hat = np.dot(iris_test[['SepalWidth', 'PetalLength', 'PetalWidth']], model.params) #model.params are betas...
#y true, y predicted, error
print np.array([iris_test['SepalLength'], y_hat, iris_test['SepalLength'] - y_hat]).T
print "mean squared error: %.3f" % np.mean((iris_test['SepalLength'] - y_hat)**2)
#an example of overfitting by using too many features compared to training observations:
#Let's setup a train and test (no development because we aren't setting any extra parameters)
iris_train = iris[:10] #training data
iris_test = iris[100:] #testing data
y_train = iris_train['SepalLength'] #from now on, lowercase => vector
X_3feats = iris_train[['SepalWidth', 'PetalLength', 'PetalWidth']] #uppercase => matrix
X_2feats = iris_train[['PetalWidth','SepalWidth']] #uppercase => matrix
#Train (fit) the models:
lr_result_3f = sm.OLS(y_train, X_3feats).fit()
lr_result_2f = sm.OLS(y_train, X_2feats).fit()
betas_3f = lr_result_3f.params
print "\nbetas for 3 features:\n", betas_3f
betas_2f = lr_result_2f.params
print "\nbetas for 2 features:\n", betas_2f
#setup test
X_test_3f = iris_test[['SepalWidth', 'PetalLength', 'PetalWidth']] #uppercase => matrix
X_test_2f = iris_test[['PetalWidth','SepalWidth']] #uppercase => matrix
y_test = iris_test['SepalLength']
#apply model to test and check eror:
y_hat_test_3f = np.dot(X_test_3f, betas_3f)
y_hat_train_3f = np.dot(X_3feats, betas_3f)
y_hat_test_2f = np.dot(X_test_2f, betas_2f)
error_3f = np.mean((y_test - y_hat_test_3f)**2)
error_3ftrain = np.mean((y_train - y_hat_train_3f)**2)
error_2f = np.mean((y_test - y_hat_test_2f)**2)
print "\nMSerror when using 3 feats(test): %.2f" % error_3f
print "MSerror when using 3 feats(train): %.2f" % error_3ftrain
print "MSerror when using 2 feats: %.2f" % error_2f
#Regularization Comparison
from sklearn.linear_model import Ridge, Lasso
#RIDGE:
weights = []
mses = []
#alphas = ([.001, .01, .1, 1, 10]
alphas = [(2**i)/10000.0 for i in xrange(20)]
print "standard deviation of y_hats as alpha increases"
for alpha in alphas:
model = Ridge(alpha=alpha)
model.fit(X_3feats, y_train)
weights.append(model.coef_)
#print model.coef_
#figure out accuracy:
y_hat = model.predict(X_test_3f)
print y_hat.std()
mses.append(np.sum((y_test - y_hat)**2))
plt.plot([[i]*len(weights[0]) for i in xrange(len(alphas))], weights)#
plt.xticks([i for i in xrange(len(alphas))], alphas, rotation='vertical')
plt.show()
plt.plot([i for i in xrange(len(alphas))], mses)
plt.xticks([i for i in xrange(len(alphas))], alphas, rotation='vertical')
plt.show()
#LASSO:
weights = []
mses = []
alphas = alphas[:13]
for alpha in alphas:
model = Lasso(alpha=alpha)
model.fit(X_3feats, y_train)
weights.append(model.coef_)
y_hat = model.predict(X_test_3f)
mses.append(np.sum((y_test - y_hat)**2))
plt.plot([[i]*len(weights[0]) for i in xrange(len(alphas))], weights)
plt.xticks([i for i in xrange(len(alphas))], alphas, rotation='vertical')
plt.show()
plt.plot([i for i in xrange(len(alphas))], mses)
plt.xticks([i for i in xrange(len(alphas))], alphas, rotation='vertical')
plt.show()
#re-setup data
y_train = iris_train['SepalLength'] #from now on, lowercase => vector
X_3feats = iris_train[['SepalWidth', 'PetalLength', 'PetalWidth']] #uppercase => matrix
X_test_3f = iris_test[['SepalWidth', 'PetalLength', 'PetalWidth']] #uppercase => matrix
y_test = iris_test['SepalLength']
#turn y into a classification problem:
split_point = y_train.mean()
y_train = [1 if i else 0 for i in y_train > split_point]
y_test = [1 if i else 0 for i in y_test > split_point]
print y_train
plt.scatter(X_3feats[['SepalWidth']], y_train)
plt.xlabel('SepalWidth')
plt.ylabel('Is Long?')
from sklearn.linear_model import LogisticRegression
#L2:
weights = []
mses = []
#alphas = np.array([.1, 1, 10, 100, 1000, 10000, 100000])
alphas = [.1*3**i for i in xrange(14)]
Cs = alphas[::-1] #reverse
for c in Cs:
model = LogisticRegression(penalty='l2', C=c)
model.fit(X_3feats, y_train)
weights.append(model.coef_[0])
#print model.coef_
#figure out accuracy:
y_hat = model.predict(X_test_3f)
mses.append(np.sum((y_test - y_hat)**2))
#print weights
plt.plot([[i]*len(weights[0]) for i in xrange(len(Cs))], weights)#
plt.xticks([i for i in xrange(len(Cs))], Cs, rotation='vertical')
plt.show()
# plt.plot([i for i in xrange(len(Cs))], mses)
#plt.xticks([i for i in xrange(len(Cs))], Cs, rotation='vertical')
# plt.show()
#L1
weights = []
mses = []
Cs = Cs[3:-2]
for c in Cs:
model = LogisticRegression(penalty='l1', C=c)
model.fit(X_3feats, y_train)
weights.append(model.coef_[0])
#print model.coef_
#figure out accuracy:
y_hat = model.predict(X_test_3f)
mses.append(np.sum((y_test - y_hat)**2))
#print weights
plt.plot([[i]*len(weights[0]) for i in xrange(len(Cs))], weights)#
plt.xticks([i for i in xrange(len(Cs))], Cs, rotation='vertical')
plt.show()
#setup data:
X = iris[['PetalWidth','SepalWidth']] #'SepalLength', 'PetalLength'
X.plot(kind='scatter', x = 'PetalWidth', y ='SepalWidth', alpha=.3)
plt.show()
#run kmeans using sklearn:
from sklearn.cluster import KMeans
model = KMeans(n_clusters=2)
model.fit(X) #run KMeans
clusters = model.predict(X)
print "Clusters assigned", clusters
#plot the results
plt.scatter(X['PetalWidth'], X['SepalWidth'], c = clusters, alpha=.4) #color-coded points
centers = model.cluster_centers_
#mark the centers
plt.scatter(centers[:, 0], centers[:, 1], marker='x', s=200, linewidths=5, color='teal', alpha=.6)
#setup data (could use same as above but changing it up for variety)
X = iris[['SepalLength', 'PetalWidth']]
#X = iris[['SepalLength', 'SepalWidth', 'PetalLength', 'PetalWidth']]
X = (X - X.mean()) #mean center the data
X.plot(kind='scatter', x ='SepalLength', y = 'PetalWidth', alpha=.4)
#sys.exit(1)
#Run PCA, using sklearn:
from sklearn.decomposition import PCA
model = PCA(n_components = 2)
model.fit(X)
#let's look at the components (directions)
V = model.components_ #components = directions
print "v martrix:\n", V
plt.scatter([V[0][0]], [V[0][1]], c = [9])
#sys.exit(1)
#draw each direction:
for c in V:
slope = c[1]/c[0]
xpoints = np.linspace(float(X[[0]].min()), float(X[[0]].max()), 10)
UDc = [xp*slope for xp in xpoints] #the pricipal component for the given direction
plt.plot(xpoints, UDc)
#plt.ylim([-3,3])
#plt.xlim([-3,3])
plt.show()
#sys.exit(1)
#let's apply the transform based on v:
X_lowdim = model.transform(X)
X_lowdim2 = np.dot(X,V.T) #same as above
print V
print X.head()
print X_lowdim[:5]
print X_lowdim2[:5]
Xld = pd.Series(X_lowdim[:,0])
Xld.hist()
plt.scatter(Xld, [-1]*len(Xld), alpha = 0.3)
plt.show()
#sys.exit(1)
#explained variance
print "percentage variance explained:" , model.explained_variance_ratio_
var = X_lowdim.std(axis=0)**2
print "variance per component ", var
print "our percentage variance explained", [vi / sum(var) for vi in var]
#setup class names to be flower names
print iris[:3]
names = list(set(iris['Name'].tolist()))
nameToIndex = dict([(names[i],i) for i in range(len(names))])
print nameToIndex
#setup X and y:
iris = iris.iloc[np.random.permutation(len(iris))]
iris_train = iris[:100] #training data
iris_test = iris[100:] #testing data
Xtrain = iris_train[['SepalWidth', 'PetalLength', 'PetalWidth','SepalLength']]
#Xtrain = iris_train[['SepalWidth']]
Xtest = iris_test[['SepalWidth', 'PetalLength', 'PetalWidth','SepalLength']]
#Xtest = iris_test[['SepalWidth']]
ytrain = np.array([nameToIndex[n] for n in iris_train['Name'].tolist()])
ytest = np.array([nameToIndex[n] for n in iris_test['Name'].tolist()])
Xtrain[['SepalWidth']].hist()
plt.show()
#setup
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(Xtrain, ytrain)
print "Class priors: ", clf.class_prior_
print "\nClass means: \n", clf.theta_
print "\nClass variance: \n", clf.sigma_
#lets just look at one test example:
oneExample = Xtest.iloc[6]
print oneExample
print "\npredicted class: ", clf.predict([oneExample])
print "\nprobabilities for each class:", ["%.6f"%p for p in clf.predict_proba([oneExample])[0]]
#check accuracy for all predictions
y_hat = clf.predict(Xtest)
accuracy = sum([1 if y_hat[i] == ytest[i] else 0 for i in range(len(ytest))]) / float(len(ytest))
print accuracy